# Purpose:
# Replicate the one-tailed Welch t-test, bootstrap CI, and Cohen's d for
# between-group differences in the proportion of within-clause disfluencies.

library(readxl)
library(readr)
library(dplyr)
library(boot)

# Paths: point `input_file` at the data set to analyse; the text report is
# written to `output_file`.
input_file <- "data/change to data file name"
output_file <- "outputs/04_within_clause_group_comparison.txt"

# Choose the reader from the (case-insensitive) file extension: Excel
# workbooks are read with readxl, everything else is parsed as CSV by readr.
file_ext <- tools::file_ext(input_file)
df <- if (tolower(file_ext) %in% c("xlsx", "xls")) {
  read_excel(input_file)
} else {
  read_csv(input_file, show_col_types = FALSE)
}

# Fail fast, naming every absent column, if the data set lacks a column the
# analysis reads.
required_cols <- c("group", "prop_w")
missing_cols <- required_cols[!(required_cols %in% names(df))]
if (length(missing_cols) != 0) {
  stop("Missing required columns: ", paste(missing_cols, collapse = ", "))
}

# Normalise group labels and keep only the rows that can be analysed:
#   - labels are trimmed and lower-cased before matching, so values such as
#     " Novice" or "EXPERT" are recognised;
#   - known synonyms are collapsed onto the canonical "novice" / "expert";
#   - rows belonging to any other group are dropped;
#   - rows with a missing prop_w are dropped, so every statistic computed from
#     `df` is based on the same observations;
#   - `group` becomes a factor with "novice" as the first level.
df <- df %>%
  mutate(group = tolower(trimws(as.character(group)))) %>%
  mutate(
    group = case_when(
      group %in% c("novice", "student", "students") ~ "novice",
      group %in% c("expert", "therapist", "therapists", "experienced") ~ "expert",
      TRUE ~ group
    )
  ) %>%
  filter(group %in% c("novice", "expert"), !is.na(prop_w)) %>%
  mutate(group = factor(group, levels = c("novice", "expert")))

# One-tailed two-sample t-test of prop_w by group. With "novice" as the first
# level of `group`, alternative = "greater" tests
# H1: mean(novice) > mean(expert).
# var.equal = FALSE is the t.test() default; it is written out because it is
# what makes this the Welch (unequal-variance) test.
t_result <- t.test(
  prop_w ~ group,
  data = df,
  alternative = "greater",
  var.equal = FALSE
)

# Bootstrap statistic for boot::boot(): the difference in mean prop_w between
# groups (novice minus expert) within one resample.
#
# Arguments:
#   data    Data frame with columns `group` and `prop_w`.
#   indices Row indices selecting the resample from `data`.
#
# Returns a single numeric value. Missing prop_w values are ignored; without
# na.rm = TRUE a single NA in a resample would make the replicate NA and the
# percentile interval could not be computed.
mean_diff <- function(data, indices) {
  d <- data[indices, ]
  mean(d$prop_w[d$group == "novice"], na.rm = TRUE) -
    mean(d$prop_w[d$group == "expert"], na.rm = TRUE)
}

# Nonparametric bootstrap of the novice - expert mean difference, followed by
# a percentile confidence interval. The seed is fixed immediately before
# resampling so the replicates are reproducible.
# NOTE(review): rows are resampled from the whole data frame (no `strata`), so
# group sizes vary between replicates — confirm this matches the analysis
# being replicated.
set.seed(123)
boot_out <- boot(
  data = df,
  statistic = mean_diff,
  R = 10000
)
boot_ci <- boot.ci(boot.out = boot_out, type = "perc")

# Per-group descriptive statistics. Means and SDs ignore missing prop_w, so
# the group sizes count only non-missing observations as well; otherwise the
# reported n and the pooled-SD weights would not match the data behind the
# means and SDs.
mean_nov <- mean(df$prop_w[df$group == "novice"], na.rm = TRUE)
sd_nov <- sd(df$prop_w[df$group == "novice"], na.rm = TRUE)
n_nov <- sum(df$group == "novice" & !is.na(df$prop_w))
mean_exp <- mean(df$prop_w[df$group == "expert"], na.rm = TRUE)
sd_exp <- sd(df$prop_w[df$group == "expert"], na.rm = TRUE)
n_exp <- sum(df$group == "expert" & !is.na(df$prop_w))

# Cohen's d: mean difference (novice - expert) divided by the pooled standard
# deviation, where each group's variance is weighted by its degrees of freedom.
pooled_sd <- sqrt(
  ((n_nov - 1) * sd_nov^2 + (n_exp - 1) * sd_exp^2) / (n_nov + n_exp - 2)
)
cohen_d <- (mean_nov - mean_exp) / pooled_sd

# Write the text report to `output_file`.
# The output directory is created first so sink() does not fail on a fresh
# checkout. The sink is released in `finally`, so an error while printing does
# not leave console output redirected into the file. invisible() keeps the
# value of the last print() from being auto-printed after the sink is closed.
dir.create(dirname(output_file), recursive = TRUE, showWarnings = FALSE)
sink(output_file)
invisible(tryCatch(
  {
    cat("Within-clause disfluency group comparison\n\n")
    cat(sprintf("Novice: M = %.3f, SD = %.3f, n = %d\n", mean_nov, sd_nov, n_nov))
    cat(sprintf("Expert: M = %.3f, SD = %.3f, n = %d\n\n", mean_exp, sd_exp, n_exp))
    print(t_result)
    cat("\nBootstrap percentile CI for novice - expert difference:\n")
    print(boot_ci)
    cat("\nCohen's d:\n")
    print(cohen_d)
  },
  finally = sink()
))
